Tech Arsenal 1

home *** CD-ROM | disk | FTP | other *** search

/ Tech Arsenal 1 / Tech Arsenal (Arsenal Computer).ISO / tek-13 / trs141f.zip / NFMT.ZIP / NFMT.C < prev next >

Wrap

C/C++ Source or Header | 1992-07-20 | 16KB | 681 lines

#include <stdio.h> #include <ctype.h> #include "config.h" #include "options.h" /* * NAME * nfmt - simple text formatter * * SYNOPSIS * usage: nfmt [ -cs ] [ -width ] [ -p prefix ] [ file ... ] * * DESCRIPTION * Nfmt is a simple text formatter, similar to the BSD program fmt(1), * but uses best-fit line breaking, by a simple version of * * "Breaking Paragraphs into Lines", * Donald E. Knuth and Michael F. Plass, * "Software--Practice and Experience" 11 (1981) 1119-1184. * * Tabs are expanded on input and re-introduced on output. * * OPTIONS * -c Crown margin mode. The indentation of the first line * of a paragraph must be different from the indentation * of the second. Subsequent lines must have the same * indentation as the second line. * * -s Split lines only. * * -width Maximum line width (default 75). Fmt prefers to make * lines about 7% shorter, to give it room to balance line * lengths. * * -p prefix * Only lines beginning with the prefix (possibly preceded * by white space) are re-arranged; the prefix (with any * preceding white space) is stripped for the formatting and * re-attached to each formatted output line. For example, * * nfmt -p ' * ' * * formats C comments laid out in the normal way, leaving * the code unchanged. The -p option may also be combined * with the other options. * * AUTHOR * Ross Paterson <rap@doc.ic.ac.uk> * Packaging and portability by Yossi Gil <yogi@cs.ubc.ca> */ /* * The following parameters represent the program's idea of what is "best". * Adjust to taste, subject to the caveats given. */ #define WIDTH 75 /* longest permitted line length */ #define LEEWAY 15 /* 1/LEEWAY of line is best left unused */ /* * Costs and bonuses are expressed as the equivalent departure from * the optimal line length, multiplied by 10. * e.g. assigning something a cost of 50 means that it is as bad as * a line 5 characters too short or long. * The definition of SHORT_COST(n) should not be changed. * However, EQUIV(n) may need tuning. */ typedef long COST; #define MAXCOST (~(((COST)1)<<(8*sizeof(COST)-1))) #define SQR(n) ((n)*(n)) #define EQUIV(n) SQR((COST)(n)) /* cost of a filled line longer or shorter than best_width */ #define SHORT_COST(n) EQUIV((n)*10) /* cost of the difference between adjacent filled lines */ #define RAGGED_COST(n) (SHORT_COST(n)/2) /* basic cost per line */ #define LINE_COST EQUIV(70) /* line break after the first word of a sentence */ #define WIDOW_COST EQUIV(200) /* line break before the last word of a sentence */ #define ORPHAN_COST EQUIV(150) /* line break at the end of a sentence */ #define SENTENCE_BONUS EQUIV(50) /* line break after close parenthesis */ #define PAREN_BONUS EQUIV(40) /* line break after other punctuation */ #define PUNCT_BONUS EQUIV(40) /* credit for breaking a long paragraph 1 line later */ #define LINE_CREDIT EQUIV(30) /* * Size of paragraph buffer, in words and characters. * Longer paragraphs are handled (cf. flush_paragraph()). */ #define MAXWORDS 1000 #define MAXCHARS 5000 /* * Extra ctype(3)-style macros */ #define isopen(c) (strchr("([`'\"", c) != NULL) #define isclose(c) (strchr(")]'\"", c) != NULL) #define isperiod(c) (strchr(".?!", c) != NULL) /* * Miscellaneous definitions */ #define reg register typedef int bool; #define TRUE 1 #define FALSE 0 #define until(c) if (c) break /* * Word descriptor structure */ #define WORD struct _WORD WORD { /* static attributes determined during input */ char *text; short length; /* length of this word */ short space; /* the size of the following space */ bool paren:1; /* starts with open paren */ bool period:1; /* ends in [.?!])* */ bool punct:1; /* ends in punctuation */ /* the remaining fields are computed during the optimization */ short line_length; /* length of the best line starting here */ COST best_cost; WORD *next_break; /* break which achieves best_cost */ }; /* * Forward declarations */ extern void trim_prefix ARGS((void)); extern void fmt ARGS((void)); extern bool get_paragraph ARGS((void)); extern int get_line ARGS((int c)); extern int get_prefix ARGS((void)); extern int get_space ARGS((int c)); extern int copy_rest ARGS((int c)); extern bool same_para ARGS((int c)); extern void flush_paragraph ARGS((void)); extern void fmt_paragraph ARGS((void)); extern void check_punctuation ARGS((WORD *w, char *finish)); extern COST base_cost ARGS((WORD *this)); extern COST line_cost ARGS((WORD *next, int len)); extern void put_paragraph ARGS((WORD *finish)); extern void put_line ARGS((WORD *w)); extern void put_prefix ARGS((int c)); extern void put_space ARGS((int space)); FILE *infile; /* * Option values and derived quantities */ bool crown = FALSE; bool split = FALSE; /* each line is a paragraph on its own */ char default_prefix[] = ""; char *prefix = default_prefix; int prefix_length; int prefix_lead_space; int prefix_full_length; int max_width = WIDTH; int best_width; main(argc, argv) reg int argc; reg char *argv[]; { reg int i; OPTIONS ("-cs -width -p prefix [ file ... ]") FLAG ('c', crown) FLAG ('s', split) NUM_OPT (max_width) STRING ('p', prefix) ENDOPTS best_width = max_width*(LEEWAY-1)/LEEWAY; trim_prefix(); if (argc == 1) { infile = stdin; fmt(); } else for (i = 1; i < argc; i++) if ((infile = fopen(argv[i], "r")) == NULL) fprintf(stderr, "%s: can't read file '%s'\n", argv[0], argv[i]); else { fmt(); fclose(infile); } return 0; } void trim_prefix() { reg char *s; prefix_lead_space = 0; while (*prefix == ' ') { prefix_lead_space++; prefix++; } prefix_full_length = strlen(prefix); s = prefix + prefix_full_length; while (s > prefix && s[-1] == ' ') s--; *s = '\0'; prefix_length = s - prefix; } int in_column = 0; int out_column = 0; char parabuf[MAXCHARS]; /* space for the paragraph text */ WORD word[MAXWORDS]; WORD *word_limit; char *wptr; int first_indent; /* indentation of first line */ int indent; /* indentation of rest of current paragraph */ int prefix_indent; int next_prefix_indent; bool tabs = FALSE; /* tabs in input? */ int next_char; /* one-character look-ahead */ char *to_match = ""; void fmt() { next_char = get_prefix(); while (get_paragraph()) { fmt_paragraph(); put_paragraph(word_limit); } } /* * Definitions. * * A <paragraph> is a maximal non-empty set of consecutive non-blank * lines at the same indent. * In split mode, a paragraph is a single non-blank line. * In crown mode, the second and subsequent lines must have the same * indentation, but differing from the first line. * If a prefix is in effect, it must also be at the same indent for * each line in the paragraph. * * A <word> is a maximal non-empty set of non-white characters. * * A <sentence break> is either the end of a paragraph or a word ending * in [.?!], possibly followed by ) or ], followed by a word beginning * with a capital. */ bool get_paragraph() { reg int c; c = next_char; /* * Scan (and copy) blank lines, * and lines not introduced by the prefix. */ while (c == '\n' || *to_match || next_prefix_indent < prefix_lead_space || in_column < next_prefix_indent + prefix_full_length) { if (prefix_length != 0) c = copy_rest(c); until(c == EOF); putchar('\n'); c = get_prefix(); } if (c == EOF) { next_char = EOF; return FALSE; } prefix_indent = next_prefix_indent; wptr = parabuf; word_limit = word; if (split) { first_indent = indent = in_column; c = get_line(c); } else if (crown) { first_indent = in_column; c = get_line(c); if (same_para(c) && in_column != first_indent) { indent = in_column; do { /* for each line till the end of the para */ c = get_line(c); } while (same_para(c) && in_column == indent); } /* * If only one line, use the secondary indent * from last time (initially 0) if it splits. */ } else { first_indent = indent = in_column; do { /* for each line till the end of the para */ c = get_line(c); } while (same_para(c) && in_column == indent); } (word_limit-1)->period = TRUE; next_char = c; return TRUE; } /* * Copy a line which failed to match the prefix to the output, * or which was blank after the prefix. * * In the former case, c is the character that failed to match *to_match. * In the latter, c is \n or EOF. * Returns the character ending the line. */ int copy_rest(c) reg int c; { reg char *s; out_column = 0; put_space(next_prefix_indent); for (s = prefix; s != to_match; s++) putchar(*s); if (c != '\n' && c != EOF) { out_column += to_match - prefix; put_space(in_column - out_column); do { putchar(c); c = getc(infile); } while (c != '\n' && c != EOF); } return c; } bool same_para(c) reg int c; { return next_prefix_indent == prefix_indent && *to_match == '\0' && c != '\n' && c != EOF; } /* * Read a line, given first non-blank character c, and the following * indent, returning the first non-blank character of the next line. */ int get_line(c) reg int c; { int start; reg char *end_of_parabuf; reg WORD *end_of_word; end_of_parabuf = ¶buf[MAXCHARS]; end_of_word = &word[MAXWORDS-2]; do { /* for each word in a line */ /* scan word */ if (islower(c) && word_limit != word) (word_limit-1)->period = FALSE; word_limit->text = wptr; do { if (wptr == end_of_parabuf) flush_paragraph(); *wptr++ = c; c = getc(infile); } while (c != EOF && ! isspace(c)); check_punctuation(word_limit, wptr-1); in_column += word_limit->length = wptr - word_limit->text; if (wptr == end_of_parabuf || word_limit == end_of_word) flush_paragraph(); *wptr++ = '\0'; /* scan inter-word space */ start = in_column; c = get_space(c); word_limit->space = in_column - start; word_limit++; if (c == EOF) return EOF; } while (c != '\n'); c = get_prefix(); (word_limit-1)->space = c != EOF && ! islower(c) && (word_limit-1)->period ? 2 : 1; return c; } int get_prefix() { reg int c; in_column = 0; c = get_space(getc(infile)); to_match = prefix; if (prefix_length == 0) next_prefix_indent = 0; else { next_prefix_indent = in_column; while (*to_match && c == *to_match) { in_column++; to_match++; c = getc(infile); } if (*to_match == '\0') c = get_space(c); } return c; } /* * Scan blank characters, keeping in_column up-to-date. */ int get_space(c) reg int c; { for (;;) { if (c == ' ') in_column++; else if (c == '\t') { tabs = TRUE; in_column = (in_column/8 + 1)*8; } else return c; c = getc(infile); } } void check_punctuation(w, finish) reg WORD *w; reg char *finish; { reg char *start; start = w->text; w->paren = isopen(*start); w->punct = ispunct(*finish); while (isclose(*finish) && finish > start) finish--; w->period = isperiod(*finish); } /* * On hitting the limit, flush part of the paragraph to make room. */ void flush_paragraph() { WORD saved_last_word; WORD *split_point; reg WORD *w; int shift; COST best_break; /* * In the special case where it's all one word, we just flush it. */ if (word_limit == word) { printf("%*s", wptr-parabuf, parabuf); wptr = parabuf; return; } /* * Otherwise: * - format what you have so far as a paragraph, * - find a low-cost line break near the end, * - output to there, * - make that the start of the paragraph. */ saved_last_word = *word_limit; fmt_paragraph(); /* choose a good split point */ split_point = word; best_break = MAXCOST; for (w = word->next_break; w != word_limit; w = w->next_break) { if (w->best_cost - w->next_break->best_cost < best_break) { split_point = w; best_break = w->best_cost - w->next_break->best_cost; } if (best_break <= MAXCOST - LINE_CREDIT) best_break += LINE_CREDIT; } if (split_point == word) { fprintf(stderr, "%s: panic\n", O_name); exit(1); } put_paragraph(split_point); *word_limit = saved_last_word; /* copy text of words down to start of parabuf */ memcpy(parabuf, split_point->text, wptr - split_point->text); shift = split_point->text - parabuf; wptr -= shift; /* adjust text pointers */ for (w = split_point; w <= word_limit; w++) w->text -= shift; /* copy words from split_point down to word */ memcpy((char *)word, (char *)split_point, (word_limit - split_point + 1)*sizeof(WORD)); word_limit -= split_point - word; } /* * Compute the optimal formatting for the whole paragraph by computing and * remembering the optimal formatting for each suffix from the empty one * to the whole paragraph. */ void fmt_paragraph() { reg WORD *start, *w; reg int len; reg COST wcost, best; word_limit->best_cost = 0; word_limit->length = max_width; /* sentinel */ for (start = word_limit-1; start >= word; start--) { best = MAXCOST; len = start == word ? first_indent : indent; /* at least one word, however long, in the line */ w = start; len += w->length; do { w++; /* consider breaking before w */ wcost = line_cost(w, len) + w->best_cost; if (wcost < best) { best = wcost; start->next_break = w; start->line_length = len; } len += (w-1)->space; /* w > start >= word */ len += w->length; } while (len < max_width); start->best_cost = best + base_cost(start); } } /* constant component of cost of breaking before this word */ COST base_cost(this) reg WORD *this; { reg COST cost; cost = LINE_COST; if (this > word) if ((this-1)->period) cost -= SENTENCE_BONUS; else if ((this-1)->punct) cost -= PUNCT_BONUS; else if (this > word+1 && (this-2)->period) cost += WIDOW_COST/((this-1)->length+2); if (this->paren) cost -= PAREN_BONUS; else if (this->period) cost += ORPHAN_COST/(this->length+2); return cost; } /* length-dependent component of cost of breaking at next */ COST line_cost(next, len) reg WORD *next; reg int len; { reg int n; reg COST cost; if (next == word_limit) return 0; n = best_width - len; cost = SHORT_COST(n); if (next->next_break != word_limit) { n = len - next->line_length; cost += RAGGED_COST(n); } return cost; } void put_paragraph(finish) reg WORD *finish; /* must be in the next_break chain from word */ { reg WORD *w; out_column = 0; put_prefix(first_indent); put_line(word); for (w = word->next_break; w != finish; w = w->next_break) { put_prefix(indent); put_line(w); } } void put_line(w) reg WORD *w; { reg WORD *endline; endline = w->next_break-1; out_column += w->length; fputs(w->text, stdout); while (w != endline) { put_space(w->space); w++; fputs(w->text, stdout); out_column += w->length; } putchar('\n'); } void put_prefix(space) int space; { out_column = 0; put_space(prefix_indent); fputs(prefix, stdout); out_column += prefix_length; put_space(space - out_column); } void put_space(space) int space; { reg int space_target, tab_target; space_target = out_column + space; if (tabs) { tab_target = space_target/8*8; if (out_column+1 < tab_target) while (out_column < tab_target) { putchar('\t'); out_column = (out_column/8 + 1)*8; } } while (out_column < space_target) { putchar(' '); out_column++; } }